package com.kdtech.analyse.outside;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import com.kdtech.crawler.CrawlHTML;

import com.kdtech.crawler.at.UrlArgumentTop;
import com.kdtech.entity.crawler.UrlMeta;
import com.kdtech.entity.data.NewsMeta;
import com.kdtech.analyse.AnalyseNews;
import com.kdtech.utils.DateUtils;
import com.kdtech.utils.HtmlCleaner;
import com.kdtech.utils.RegexUtils;
import com.kdtech.utils.StringUtils;

/**
 * Article parser for South News (Nanfang Kuaibao), southnews.com.tw.
 *
 * <p>Decides whether a URL looks like an article detail page and extracts the
 * title, publication date and content from the page HTML. The site markup is
 * built from deeply nested layout tables, so each field is located by trying
 * an ordered list of CSS selectors and keeping the first usable result.
 *
 * @name South News (Nanfang Kuaibao)
 * @link http://www.southnews.com.tw/web-tv/00/00351.htm
 * @author dhy
 */
public class SouthnewsOutsideAnalyse implements AnalyseNews {

	/**
	 * URL patterns that identify an article detail page.
	 * NOTE(review): the dots in the host name and in ".htm" are not escaped;
	 * if RegexUtils treats these as regular expressions they match any
	 * character. Left as-is to keep matching behavior unchanged.
	 */
	private static final String[] DETAIL_URL_PATTERNS = {
		"http://www.southnews.com.tw/.*/[0-9]*/[0-9]*.htm",
	};

	/** Selectors tried in order for the publication date; the first one whose text yields a date wins. */
	private static final String[] DATE_SELECTORS = {
		"html body div center table tbody tr td div table tbody tr td div table tbody tr td div table tbody tr td div table tbody tr td div table tbody tr td div table tbody tr td p font b",
		"html body div center table tbody tr td div table tbody tr td div table tbody tr td div table tbody tr td div table tbody tr td div table tbody tr td div table tbody tr td p",
		"html body div table tbody tr td div center table tbody tr td div table tbody tr td div table tbody tr td div table tbody tr td div table tbody tr td p font",
		"font[color=#400000]",
		"html body div center table tbody tr td div table tbody tr td div table tbody tr td div table tbody tr td div table tbody tr td table tbody tr td div table tbody tr td p font",
		"font[size=2]",
	};

	/** Selectors tried in order for the article body; the first one producing non-blank content wins. */
	private static final String[] CONTENT_SELECTORS = {
		"html body div center table tbody tr td div table tbody tr td div table tbody tr td div table tbody tr td div table tbody tr td div table",
		"html body div table tbody tr td div center table tbody tr td div table tbody tr td div table tbody tr td div table tbody tr td p",
		"html body div table tbody tr td div center table tbody tr td div table tbody tr td div table tbody tr td p",
	};

	/**
	 * Tells whether the given URL matches one of the article detail page patterns.
	 *
	 * @param url the URL to test
	 * @return the result of the case-insensitive pattern match
	 */
	public boolean isDetailPage(String url) {
		return RegexUtils.matchAnyIgnoreCase(url, DETAIL_URL_PATTERNS);
	}

	/**
	 * Parses a downloaded article page into a {@link NewsMeta}.
	 *
	 * <p>The URL is always copied onto the result. When the page HTML is
	 * {@code null} there is nothing to parse, so the result is returned with
	 * only the URL set (previously this case fell through to
	 * {@code Jsoup.parse(null)}, which rejects null input).
	 *
	 * @param urlMeta crawl result carrying the page URL and its HTML
	 * @return the extracted news record; title, content and date may be
	 *         blank or {@code null} when no selector matched
	 */
	public NewsMeta parserHtml(UrlMeta urlMeta) {
		NewsMeta news = new NewsMeta();
		String url = urlMeta.getUrl();
		news.setUrl(url);

		String html = urlMeta.getHtml();
		if (html == null) {
			// No HTML was fetched: skip parsing instead of letting jsoup throw.
			return news;
		}

		Document doc = Jsoup.parse(html);
		String title = extractTitle(doc);
		Long date = extractDate(doc);
		String content = extractContent(url, doc);
		// No author is extracted for this site; the field is explicitly set to null.
		String author = null;

		news.setTitle(StringUtils.trimSpace(title));
		news.setContent(content);
		news.setAuthor(author);
		news.setDate(date);
		return news;
	}

	/** Finds the headline: highlighted font first, then the subject class, then the tail of the page title. */
	private static String extractTitle(Document doc) {
		String title = doc.select("font[color=#D20000]").text();
		if (StringUtils.isBlank(title)) {
			title = doc.select("font.edit_subject02").text();
		}
		if (StringUtils.isBlank(title)) {
			title = doc.select("title").text();
			if (StringUtils.isNotBlank(title)) {
				// Keep only the part of the <title> text after the last "-".
				title = StringUtils.substringAfterLast(title, "-");
			}
		}
		return title;
	}

	/** Returns the first date recognised in the text of the date selectors, or null if none yields one. */
	private static Long extractDate(Document doc) {
		for (String selector : DATE_SELECTORS) {
			Long date = DateUtils.matchDate(doc.select(selector).text());
			if (date != null) {
				return date;
			}
		}
		return null;
	}

	/** Returns the first non-blank cleaned content, or the last attempt's result when every selector comes up blank. */
	private static String extractContent(String url, Document doc) {
		String content = null;
		for (String selector : CONTENT_SELECTORS) {
			content = HtmlCleaner.getContentHtml(url, doc.select(selector));
			if (StringUtils.isNotBlank(content)) {
				break;
			}
		}
		return content;
	}
}
